Expand code block to see libraries and functions used.
library(dplyr)
library(data.table)
library(stringr)
library(randomForest)
library(lme4)
library(gbm)
library(dismo)
library(ROCR)
library(PresenceAbsence) # for Kappa and PCC
library(ggplot2)
# Build a ROCR prediction object from a fitted model.
#
# modobj  -- a fitted model: randomForest ("RF"), gbm ("BRT"), or a
#            merMod fit (any other value, default "GLM").
# modtype -- string selecting how fitted probabilities and observed
#            labels are extracted from the model object.
# Returns: a ROCR::prediction object pairing fitted probabilities with
# the observed presence/absence labels stored on the model.
get.pred <- function(modobj, modtype = "GLM") {
  switch(modtype,
    RF = {
      # class-membership probabilities; column 2 assumed to be presence
      probs <- predict(modobj, type = "prob")
      prediction(probs[, 2], modobj$y)
    },
    BRT = {
      probs <- predict(modobj, type = "response")
      prediction(probs, modobj$data$y)
    },
    # default (GLM/merMod): fitted means and response kept on the fit
    prediction(modobj@resp$mu, modobj@resp$y)
  )
}
# Build the data frame PresenceAbsence expects from one model's slot of a
# (possibly multi-model) ROCR prediction object, and return its confusion
# matrix at the given threshold.
#
# predRDS -- ROCR::prediction object (may hold several models' curves).
# thresh  -- probability cutoff for calling a prediction "presence".
# column  -- which model's predictions/labels to use (list index).
# Returns: the confusion matrix from PresenceAbsence::cmx.
ConfMatrix <- function(predRDS, thresh, column = 1){
  obs_fac <- predRDS@labels[[column]]
  # PresenceAbsence insists on its own data structure: ID, observed
  # (numeric 0/1), predicted probability.
  padat <- data.frame(
    # FIX: index by 'column' (original hardcoded [[1]]) so ID length
    # always matches OBS/PRED; seq_along replaces seq(1:length(...)).
    ID = seq_along(predRDS@predictions[[column]]),
    # convert the factor labels back to their numeric 0/1 values
    OBS = as.numeric(levels(obs_fac))[obs_fac],
    PRED = predRDS@predictions[[column]]
  )
  pacmx <- cmx(padat, threshold = thresh)
  return(pacmx)
}
#https://github.com/brendano/dlanalysis/blob/master/util.R
# Draw dashed guide segments from both axes to the point (x, y) on the
# current base-graphics device. Used below to mark a model's operating
# point on the performance curves.
linelight <- function(x,y, lty='dashed', col='lightgray', ...) {
# highlight a point with lines running to the axes.
# left/bottom edges of the plot region, in user coordinates
left = par('usr')[1]
bot = par('usr')[3]
# horizontal segment from the y-axis out to the point
segments(left,y, x,y, lty=lty, col=col, ...)
# vertical segment from the x-axis up to the point
segments(x,bot, x,y, lty=lty, col=col, ...)
}These maps are displayed using a linear min to max stretch, green (high) to white (low) color ramp.
GLMER
RF
BRT
Ensemble
# Read the held-out (15%) test-set evaluation metrics and the 10-fold
# cross-validation metrics written by the model-fitting scripts.
# NOTE(review): 'pth' is defined elsewhere in this document.
testOut <- fread(paste0(pth, "Testing15pct_evalMetrics.csv"))
cvTable <- fread(paste0(pth, "CrossValidation_metricsMay14.csv"))
# Long format for ggplot: one row per (cv fold, model, metric).
tograph <- melt(cvTable, id.vars = c("cv", "model"), measure.vars = c(2:9), variable.name = "metric")
# Test-set metrics melted the same way so they overlay as points.
toadd <- melt(testOut, id.vars = "Name", measure.vars = c(2:9), variable.name = "metric")ggplot(data=tograph, aes(x=factor(metric), y=value)) +
geom_boxplot(aes(fill=factor(model))) +
ylim(c(0,1)) +
labs(x="Performance Metric", y="",
title="10-fold Cross-Validation Plus Testing Data, Sensitivity=0.95") +
# Diamonds: the single test-set value for each model, per metric.
geom_point(data=toadd, aes(x=factor(metric),y=value,
color=factor(Name)),
size=4, pch=18) +
guides(fill=guide_legend(title = "10fold Training"),
color=guide_legend(title = "Testing data"))kable(testOut, format = "html", row.names = FALSE, digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
column_spec(2:9, color="black") %>% column_spec(1, bold=TRUE)| Name | AUC | TSS | err_rate | kappa | PCC | Sensitivity | Specificity | Threshold |
|---|---|---|---|---|---|---|---|---|
| GLM_test15pct | 0.983 | 0.895 | 0.053 | 0.895 | 0.947 | 0.95 | 0.945 | 0.663 |
| RF_test15pct | 0.978 | 0.846 | 0.077 | 0.846 | 0.923 | 0.95 | 0.896 | 0.495 |
| BRT_test15pct | 0.940 | 0.728 | 0.136 | 0.728 | 0.864 | 0.95 | 0.778 | 0.340 |
| EN_test15pct | 0.965 | 0.797 | 0.102 | 0.797 | 0.898 | 0.95 | 0.847 | 0.335 |
# Saved ROCR prediction object holding all four models' test-set output.
predAll <- readRDS(paste0(pth, "predictions15pct_allMod.rds"))
# Performance curves across thresholds: ROC (tpr vs fpr), error rate,
# the flipped ROC (tnr vs fnr), and precision vs accuracy.
perf.roc <- performance(predAll, "tpr", "fpr")
perf.err <- performance(predAll, "err")
perf.flip <- performance(predAll, "tnr", "fnr")
perf.PA <- performance(predAll, "prec", "acc")Gray solid lines indicate random performance.
Dashed lines show the values the axes measure for the Ensemble at the Sensitivity = 0.95 threshold (0.3352343).
# Narrow band around the Ensemble's threshold (testOut row 4, column 9),
# used to locate the matching cutoff index along each performance curve.
en_thresh <- as.numeric(testOut[4, 9])
trng <- en_thresh + c(-0.000001, 0.000001)
# index of the ROC-curve point whose alpha (cutoff) falls in the band
alpha_en <- perf.roc@alpha.values[[4]]
i <- which(alpha_en > trng[1] & alpha_en < trng[2])
# 1) Red, 2) Green, 3) Cyan, 4) Purple
# ROC curves for all four models on one device; the plum guide lines mark
# the Ensemble's operating point at the Sensitivity = 0.95 threshold.
plot(perf.roc, col=as.list(rainbow(4)),
main="ROC for the 4 models",
xlim=c(0,1))
linelight(x=perf.roc@x.values[[4]][i], y=perf.roc@y.values[[4]][i],
col="plum3")
# 1:1 diagonal = random classifier.
abline(a = 0, b = 1, col = "gray")
legend("right", c("GLM", "RF", "BRT", "EN"), lty = "solid",
col = rainbow(4), bty = "n")
# Error rate as a function of threshold cutoff; plum guides mark the
# Ensemble's threshold (x) and its test-set error rate (y) from testOut.
plot(perf.err, col=as.list(rainbow(4)),
main="Error Rate vs. Threshold Cutoff",
xlim=c(0,1), ylim=c(0,1))
linelight(x=as.numeric(testOut[4,9]),
y=as.numeric(testOut[4,4]), col="plum3")
# horizontal 0.5 reference = random performance
lines(x=c(0,1), y=c(0.5,0.5), col="gray")
legend("top", c("GLM", "RF", "BRT", "EN"), lty = "solid",
col = rainbow(4), bty = "n")
# Re-locate the Ensemble threshold index on the flipped (tnr vs fnr)
# curve; alpha values are stored per curve, so the index can differ
# from the ROC's.
i <- which((perf.flip@alpha.values[[4]] > trng[1]) &
(perf.flip@alpha.values[[4]] < trng[2]))
plot(perf.flip, col=as.list(rainbow(4)),
main="Flip-side of the ROC")
abline(a = 0, b = 1, col = "gray")
linelight(x=perf.flip@x.values[[4]][i],
y=perf.flip@y.values[[4]][i],
col="plum3")
legend("right", c("GLM", "RF", "BRT", "EN"), lty = "solid",
col = rainbow(4), bty = "n")
# Precision vs. accuracy, zoomed to the upper-right quadrant; the plum
# guides again mark the Ensemble's Sensitivity = 0.95 operating point.
i <- which((perf.PA@alpha.values[[4]] > trng[1]) &
(perf.PA@alpha.values[[4]] < trng[2]))
plot(perf.PA, col=as.list(rainbow(4)),
main="Precision vs. Accuracy",
xlim=c(0.5,1), ylim=c(0.5,1))
linelight(x=perf.PA@x.values[[4]][i],
y=perf.PA@y.values[[4]][i],
col="plum3")
legend("left", c("GLM", "RF", "BRT", "EN"), lty = "solid",
col = rainbow(4), bty = "n")Accuracy: \(P(Yhat = Y)\). Estimated as: \((TP+TN)/(P+N)\)
Precision: Positive predictive value. \(P(Y = + | Yhat = +)\). Estimated as: \(TP/(TP+FP)\)
These are calculated at Sensitivity = 0.95 and from the Testing data (of which there are 20,900 records).
…and we want them to look like this
# Confusion matrix for the GLM at its Sensitivity = 0.95 threshold
# (testOut row 1, column 9), rendered as a heat-map tile plot.
cmxG <- ConfMatrix(predAll, thresh = as.numeric(testOut[1,9]), column = 1)
#Adapted from https://stackoverflow.com/questions/37897252/plot-confusion-matrix-in-r-using-ggplot
ggplot(as.data.frame(cmxG), aes(x=observed, y=predicted)) +
  geom_tile(aes(fill=Freq), color="white") +
  # print the cell counts on top of the tiles
  geom_text(aes(label=Freq), vjust=1) +
  scale_fill_gradientn(colors = hcl.colors(4, "Heat"),
                       breaks=c(500,2500,5000,10000)) +
  labs(title = "GLM Confusion Matrix") +
  theme_bw() +
  theme(legend.position = "none")
# Confusion matrix for the random forest at its Sensitivity = 0.95
# threshold (testOut row 2, column 9).
cmxR <- ConfMatrix(predAll, thresh = as.numeric(testOut[2,9]), column = 2)
ggplot(as.data.frame(cmxR), aes(x=observed, y=predicted)) +
  geom_tile(aes(fill=Freq), color="white") +
  # print the cell counts on top of the tiles
  geom_text(aes(label=Freq), vjust=1) +
  scale_fill_gradientn(colors = hcl.colors(4, "Heat"),
                       breaks=c(500,2500,5000,10000)) +
  labs(title = "RF Confusion Matrix") +
  theme_bw() +
  theme(legend.position = "none")
# Confusion matrix for the BRT at its Sensitivity = 0.95 threshold
# (testOut row 3, column 9).
cmxB <- ConfMatrix(predAll, thresh = as.numeric(testOut[3,9]), column = 3)
ggplot(as.data.frame(cmxB), aes(x=observed, y=predicted)) +
  geom_tile(aes(fill=Freq), color="white") +
  # print the cell counts on top of the tiles
  geom_text(aes(label=Freq), vjust=1) +
  scale_fill_gradientn(colors = hcl.colors(4, "Heat"),
                       breaks=c(500,2500,5000,10000)) +
  labs(title = "BRT Confusion Matrix") +
  theme_bw() +
  theme(legend.position = "none")
# Confusion matrix for the Ensemble at its Sensitivity = 0.95 threshold
# (testOut row 4, column 9).
cmxE <- ConfMatrix(predAll,
thresh = as.numeric(testOut[4,9]),
column = 4)
ggplot(data=as.data.frame(cmxE),
aes(x=observed, y=predicted)) +
geom_tile(aes(fill=Freq), color="white") +
geom_text(aes(label=Freq), vjust=1) +
scale_fill_gradientn(breaks=c(500,2500,5000,10000),
colors = hcl.colors(4, "Heat")) +
labs(title = "Ensemble Confusion Matrix") +
theme_bw() + theme(legend.position = "none")RF_mod <- readRDS(paste0(pth,"RF_4800trees_May12.rds"))
# Fitted model objects saved by the training scripts (RF_mod is loaded
# just above).
GLM_mod <- readRDS(paste0(pth,"GLMER_May14_70pct.rds"))
BRT_mod <- readRDS(paste0(pth,"BRT_gbmholdout_50ktMay13.rds"))

# --- Random forest importance ---
# keep.rownames = TRUE preserves predictor names in column 'rn'.
RF_imp <- as.data.table(RF_mod[["importance"]],
                        keep.rownames = TRUE)
ginisum <- sum(RF_imp$MeanDecreaseGini)
# Normalize so the Gini importances sum to 1 across predictors.
RF_imp[, standGini := (MeanDecreaseGini / ginisum)]
RF_imp <- setorder(RF_imp, -MeanDecreaseAccuracy)

# --- BRT relative influence ---
# FIX: original passed the misspelled argument 'keekeep.rownames';
# the intended argument is keep.rownames (FALSE: summary() already
# returns predictor names in its 'var' column).
BRT_imp <- as.data.table(summary(BRT_mod, plotit = FALSE),
                         keep.rownames = FALSE)
relsum <- sum(BRT_imp$rel.inf)
# Normalize relative influence to sum to 1.
BRT_imp[, standInfl := (rel.inf / relsum)]

# --- GLMM "importance": absolute fixed-effect coefficient size ---
GLM_imp <- data.table(var=fixef(GLM_mod),
                      inputs=names(fixef(GLM_mod)))
GLM_imp[, absval := abs(var)]
varsum <- sum(GLM_imp$absval)
# Normalize |coefficients| to sum to 1.
GLM_imp[, standvar := (absval / varsum)]
# Sort the GLMM fixed effects by normalized importance, largest first.
GLM_imp <- setorder(GLM_imp, -standvar)The values have all been normalized so that the sum of all variable importance measures for a model = 1. These are from the Training data.
# GLMM variable importance: normalized |fixed-effect coefficient|,
# bars ordered by magnitude and flipped horizontal.
ggplot(GLM_imp, aes(x = reorder(inputs, standvar, sum), y = standvar)) +
  geom_col(color = "black",
           fill = hcl.colors(nrow(GLM_imp), "Purple-Blue")) +
  coord_flip() +
  ylim(c(0, 0.31)) +
  labs(x = "", y = "Relative Importance",
       title = "Generalized Linear Mixed Model")
# Random forest importance (mean decrease in accuracy), largest at top.
ggplot(RF_imp, aes(x = reorder(rn, MeanDecreaseAccuracy, sum),
                   y = MeanDecreaseAccuracy)) +
  geom_col(color = "black",
           fill = hcl.colors(nrow(RF_imp), "purples")) +
  coord_flip() +
  ylim(c(0, 0.31)) +
  labs(x = "", y = "Relative Importance",
       title = "Random Forest")
# BRT importance: normalized relative influence from summary(BRT_mod).
ggplot(BRT_imp, aes(x=reorder(var, standInfl, sum),
y=standInfl)) +
geom_col(fill=hcl.colors(nrow(BRT_imp), "purples"),
color="black") +
coord_flip() + ylim(c(0,0.31)) +
labs(x="", y="Relative Importance",
title="Boosted Regression Tree")Meh… I cannot figure out any way to get meaningful plots out of a randomForest::randomForest object.
#only works if BRT_mod winds up being a dismo object
# Pairwise interaction strengths from dismo::gbm.interactions; rank.list
# holds the strongest variable pairs, rendered as an HTML table.
BRTinter <- gbm.interactions(BRT_mod, mask.object = BRT_mod)
kable(BRTinter$rank.list, format = "html", row.names = FALSE, digits = 0) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
column_spec(c(2,4), bold=TRUE)| var1.index | var1.names | var2.index | var2.names | int.size |
|---|---|---|---|---|
| 10 | ppt_ws | 9 | ppt_sf | 1327 |
| 9 | ppt_sf | 4 | GDD5 | 1208 |
| 6 | northness | 3 | eastness | 942 |
| 10 | ppt_ws | 4 | GDD5 | 355 |
| 13 | TWI | 1 | bd | 343 |
| 11 | sand | 9 | ppt_sf | 289 |
| 5 | nlcd | 4 | GDD5 | 277 |
| 10 | ppt_ws | 2 | clay | 266 |
# 3-D partial-dependence surfaces for the two strongest interactions
# from the table above (variable indices 10x9 and 9x4).
gbm.perspec(BRT_mod, x=10, y=9, phi=20, col="lightblue", shade=0.8)
gbm.perspec(BRT_mod, x=9, y=4, phi=20, col="lightblue", shade=0.8)# #below is just the Grid_ID as random effect.
# Conditional modes of the Grid_ID random effect from the GLMM.
thing <- ranef(GLM_mod)
#dotplot.ranef.mer(thing, data=GLM_mod)
# Q-Q plot of the random-effect modes to eyeball normality.
lattice::qqmath(thing, data=GLM_mod) #slightly easier to look at## $Grid_ID
The dots are the mean residuals and the black lines are, I think, the standard error of the residuals for each Grid_ID.
I honestly have no idea what to make of this graph, except that the data isn’t normally distributed…